{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "初始化spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#SparkContext\n",
    "from pyspark import SparkContext\n",
    "sc = SparkContext(master = 'local[2]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Calculations With Variables\n",
    "sc.version\n",
    "sc.pythonVer\n",
    "sc.master\n",
    "str(sc.sparkHome)\n",
    "str(sc.sparkUser())\n",
    "sc.appName\n",
    "sc.applicationId\n",
    "sc.defaultParallelism\n",
    "sc.defaultMinPartitions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Configuration\n",
    "from pyspark import SparkConf, SparkContext\n",
    "conf = (SparkConf().setMaster(\"local\").setAppName(\"My app\").set(\"spark.executor.memory\", \"1g\"))\n",
    "sc = SparkContext(conf = conf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Parallelized Collections\n",
    "rdd = sc.parallelize([('a',7),('a',2),('b',2)])\n",
    "rdd2 = sc.parallelize([('a',2),('d',1),('b',1)])\n",
    "rdd3 = sc.parallelize(range(100))\n",
    "rdd4 = sc.parallelize([(\"a\",[\"x\",\"y\",\"z\"]),(\"b\",[\"p\", \"r\"])])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#External Data\n",
    "textFile = sc.textFile(\"/my/directory/*.txt\")\n",
    "textFile2 = sc.wholeTextFiles(\"/my/directory/\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "选择数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Getting\n",
    "rdd.collect()  #[('a', 7), ('a', 2), ('b', 2)]\n",
    "rdd.take(2)    #[('a', 7), ('a', 2)]\n",
    "rdd.first()    #('a', 7)\n",
    "rdd.top(2)     #[('b', 2), ('a', 7)]\n",
    "\n",
    "#Sampling\n",
    "rdd3.sample(False, 0.15, 81).collect() #[3,4,27,31,40,41,42,43,60,76,79,80,86,97]\n",
    "\n",
    "#Filtering\n",
    "rdd.filter(lambda x: \"a\" in x).collect() #[('a',7),('a',2)]\n",
    "rdd5.distinct().collect()  #['a',2,'b',7]\n",
    "rdd.keys().collect() #['a', 'a', 'b']\n",
    "\n",
    "#Iterating\n",
    "def g(x): print(x)\n",
    "rdd.foreach(g)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "('a', 7)\n",
    "('b', 2)\n",
    "('a', 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "获取RDD信息：基本信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rdd.getNumPartitions()\n",
    "rdd.count()\n",
    "rdd.countByKey()\n",
    "rdd.countByValue()\n",
    "rdd.collectAsMap()\n",
    "rdd3.sum()\n",
    "sc.parallelize([]).isEmpty()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "获取RDD信息：概要信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rdd3.max()\n",
    "rdd3.min()\n",
    "rdd3.mean()\n",
    "rdd3.stdev()\n",
    "rdd3.variance()\n",
    "rdd3.histogram(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rdd.map(lambda x:x+(x[1],x[0]))\n",
    "rdd5=rdd.flatMap(lambda x:x+(x[1],x[0]))\n",
    "rdd5.collect()\n",
    "rdd4.flatMapValues(lambda x:x).collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数学操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rdd.subtracrt(rdd2)  #返回差集\n",
    "rdd2.subtractByKey(rdd)  #返回key的差集\n",
    "rdd.cartesian(rdd2).collect() "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rdd2.sortBy(lambda x:x[1]).collect()\n",
    "rdd2.sortByKey()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数据变形"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rdd.repartition(4)\n",
    "rddd.coalesce(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "保存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rdd.saveAsTextFile(\"rdd.txt\")\n",
    "rdd.saveAsHadoopFile(\"hdfs://xxxxx.\",'org.apache.hadoop.mapred.TextOutputFormat')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "停止"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sc.stop()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "执行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "$ ./bin/spark-submit examples/src/main/python/pi.py"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
